Putting Visual Analytics into Practical Use
We are required to attempt one of the three questions under Challenge 3 of VAST Challenge 2022 on the economy of the city of Engagement, Ohio, USA by using appropriate static and interactive statistical graphics methods.
With that, I have chosen #3 which is to:
Describe the health of the various employers within the city limits. What employment patterns do you observe? Do you notice any areas of particularly high or low turnover? Limit your response to 10 images and 500 words.
About 1000 representative residents have volunteered to provide data using the city’s urban planning app, which records the places they visit, their spending, and their purchases, among other things; this totals 3 dataset folders (Activity Logs, Attributes, Journals). For this exercise, we refer to Employers as those that offer jobs as stipulated in the Jobs dataset; we thus selected the datasets Employers, Buildings and Jobs from the Attributes folder, and CheckinJournal from the Journals folder, which give details on the employers and jobs.
The packages tidyverse (including dplyr, ggplot2, patchwork), ggrepel, lubridate, gapminder, gganimate and ggiraph will be used for the purpose of this exercise:
The code chunk below is used to install and load the required packages onto RStudio.
# Install (if missing) and load every package required for this exercise.
# require() returns FALSE instead of erroring when a package is absent,
# which is exactly what this install-if-missing idiom needs.
packages <- c('tidyverse', 'treemap', 'ggrepel', 'lubridate',
              'gapminder', 'gganimate', 'ggiraph', 'plotly')
for (p in packages) {
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
The code chunk below imports Employers.csv,
Buildings.csv, Jobs.csv and
CheckinJournal.csv from the data folder into R by using
read_csv() and saves each as a tibble data frame.
# Import the raw CSV files from the data folder as tibbles via readr::read_csv().
Employers <- read_csv("data/Employers.csv")
Buildings <- read_csv("data/Buildings.csv")
Jobs <- read_csv("data/Jobs.csv")
Checkin <- read_csv("data/CheckinJournal.csv")
Participants <- read_csv("data/Participants.csv")
polygon(x = c(0.7, 1.3, 1.2, 0.8), # X-Coordinates of polygon y = c(0.6, 0.8, 1.4, 1), # Y-Coordinates of polygon col = “#1b98e0”)
First, let’s get a general sense of our data using the function
summary().
summary(Employers)
employerId location buildingId
Min. : 379 Length:253 Min. : 3.0
1st Qu.: 829 Class :character 1st Qu.: 261.0
Median :1279 Mode :character Median : 486.0
Mean :1089 Mean : 517.8
3rd Qu.:1734 3rd Qu.: 782.0
Max. :1797 Max. :1041.0
summary(Buildings)
buildingId location buildingType
Min. : 1.0 Length:1042 Length:1042
1st Qu.: 261.2 Class :character Class :character
Median : 521.5 Mode :character Mode :character
Mean : 521.5
3rd Qu.: 781.8
Max. :1042.0
maxOccupancy units
Min. : 1.00 Length:1042
1st Qu.: 5.00 Class :character
Median : 7.00 Mode :character
Mean : 15.51
3rd Qu.: 12.00
Max. :418.00
NA's :539
summary(Jobs)
jobId employerId hourlyRate startTime
Min. : 0.0 Min. : 379 Min. : 10.00 Length:1328
1st Qu.: 331.8 1st Qu.: 438 1st Qu.: 10.03 Class1:hms
Median : 663.5 Median : 884 Median : 14.74 Class2:difftime
Mean : 663.5 Mean :1059 Mean : 19.13 Mode :numeric
3rd Qu.: 995.2 3rd Qu.:1337 3rd Qu.: 23.32
Max. :1327.0 Max. :1797 Max. :100.00
endTime daysToWork educationRequirement
Length:1328 Length:1328 Length:1328
Class1:hms Class :character Class :character
Class2:difftime Mode :character Mode :character
Mode :numeric
summary(Checkin)
participantId timestamp venueId
Min. : 0.0 Min. :2022-03-01 05:35:00 Min. : 0
1st Qu.: 221.0 1st Qu.:2022-06-10 18:30:00 1st Qu.: 449
Median : 464.0 Median :2022-10-03 20:25:00 Median : 910
Mean : 480.5 Mean :2022-10-05 07:41:29 Mean :1015
3rd Qu.: 726.0 3rd Qu.:2023-01-28 08:10:00 3rd Qu.:1358
Max. :1010.0 Max. :2023-05-25 00:05:00 Max. :1805
venueType
Length:2100635
Class :character
Mode :character
Employers dataset is joined with Buildings dataset
based on buildingId to filter only relevant information
from Buildings dataset in regards to employers. Left join on
Employers is used as commercial buildings are a subset of the
different type of buildings.
Employers <- Employers %>% left_join(Buildings,by="buildingId")
Next, we use outer join on Employers and Jobs based
on employerId to have a full overview on all the jobs that
are offered by each employer.
Employers = merge(x=Employers,y=Jobs,by="employerId",all=TRUE)
First, we start by renaming the columns and values in the
Employers and Checkin datasets using the functions rename()
and sub()
for a better format and ease of reading.
Note: A check between both datasets shows that venueId
in Checkin dataset refers to the employerId,
pubId etc. For the purpose of this exercise, we are only
interested in the employerId (venueType =
Workspace) and other venues type will be removed
subsequently.
# rename columns
# Standardise the merged Employers table's column names for readability.
# location.x / location.y are the employer point and building polygon
# geometries produced by the preceding joins.
Employers <- Employers %>%
  rename("Employer_ID" = "employerId",
         "Location(Pt)" = "location.x",
         "Location(Area)" = "location.y",
         "Building_ID" = "buildingId",
         "Building_Type" = "buildingType",
         "Max_Occupancy" = "maxOccupancy",
         "Units" = "units",
         "Job_ID" = "jobId",
         "Hourly_Rate" = "hourlyRate",
         "Start_Time" = "startTime",
         "End_Time" = "endTime",
         "Days_To_Work" = "daysToWork",
         "Education_Level" = "educationRequirement")
# Apply the same naming convention to the standalone Buildings table.
Buildings <- Buildings %>%
  rename("Building_Type" = "buildingType",
         "Building_ID" = "buildingId")
# Rename Checkin's columns; venueId is relabelled Employer_ID because, for the
# Workplace rows this analysis keeps, the venue ID is the employer ID.
Checkin <- Checkin %>%
  rename("Participant_ID" = "participantId",
         "Timestamp" = "timestamp",
         "Employer_ID" = "venueId",
         "Venue_Type" = "venueType")
# Align Jobs' education column name with the other tables.
Jobs <- Jobs %>%
  rename("Education_Level" = "educationRequirement")
# Standardise Participants' column names to match the convention used above.
Participants <- Participants %>%
  rename("Participant_ID" = "participantId",
         "Household_Size" = "householdSize",
         "Have_Kids" = "haveKids",
         "Age" = "age",
         "Education_Level" = "educationLevel",
         "Interest_Group" = "interestGroup",
         "Joviality" = "joviality")
# Rename row values: expand the compact education label into a readable form.
# Each element holds at most one occurrence, so sub() (first match) suffices.
from_label <- 'HighSchoolOrCollege'
to_label <- 'High School or College'
Employers$Education_Level <- sub(from_label, to_label, Employers$Education_Level)
Jobs$Education_Level <- sub(from_label, to_label, Jobs$Education_Level)
Participants$Education_Level <- sub(from_label, to_label, Participants$Education_Level)
We are using Checkin to see the changes in employment
(i.e. check-ins by participants at their workplace) over time. We see that the
venueId column in the dataset contains IDs of all possible
venues such as workplaces, restaurants and pubs.
Given that we are only interested in workplaces, we first take the
subset of rows whose venue type reads “Workplace” using
grep(), then treat this venue ID as the employer ID and left
join with Employers.
Then, the frequency count of check-ins by date and employer is computed by using the group_by method found in dplyr.
#Extract the date from timestamp
# Derive a calendar date from each check-in timestamp for daily aggregation.
Checkin$Date <- as.Date(Checkin$Timestamp)
# Filter: keep only rows whose venue type matches "Workplace".
Workplace_Checkin <- Checkin[grepl("Workplace", Checkin$Venue_Type), ]
# Daily head-count per employer: number of distinct participants checking in.
Count_Checkin <- Workplace_Checkin %>%
  group_by(Date, Employer_ID) %>%
  summarise(Num_of_Employees = n_distinct(Participant_ID))
The data frames Employers, Checkin and Count_Checkin are saved and read in RDS format to avoid uploading large files to Git.
Employer_ID Location(Pt)
1 379 POINT (-1849.997168394888 1744.6010147106394)
2 379 POINT (-1849.997168394888 1744.6010147106394)
3 380 POINT (41.51783767879146 418.7264799744545)
4 380 POINT (41.51783767879146 418.7264799744545)
5 381 POINT (877.2786575380362 1358.5441805909259)
6 381 POINT (877.2786575380362 1358.5441805909259)
Building_ID
1 823
2 823
3 154
4 154
5 279
6 279
Location(Area)
1 POLYGON ((-1827.4692833905406 1695.6576709214576, -1867.4171292489727 1693.6157083187259, -1872.5250533992355 1793.544358499821, -1832.577207540803 1795.5863211025528, -1827.4692833905406 1695.6576709214576))
2 POLYGON ((-1827.4692833905406 1695.6576709214576, -1867.4171292489727 1693.6157083187259, -1872.5250533992355 1793.544358499821, -1832.577207540803 1795.5863211025528, -1827.4692833905406 1695.6576709214576))
3 POLYGON ((70.42637822497251 553.0041088419151, 67.51462730185047 375.56179881584416, 52.18537635533121 151.5122512694542, 12.280631158043583 154.2711116754102, 27.554342810484837 377.50215900919795, 28.052028476193776 684.9788325328564, 68.04559160688791 685.6964035127485, 70.42637822497251 553.0041088419151))
4 POLYGON ((70.42637822497251 553.0041088419151, 67.51462730185047 375.56179881584416, 52.18537635533121 151.5122512694542, 12.280631158043583 154.2711116754102, 27.554342810484837 377.50215900919795, 28.052028476193776 684.9788325328564, 68.04559160688791 685.6964035127485, 70.42637822497251 553.0041088419151))
5 POLYGON ((905.3736148660081 1326.7099056312004, 884.0624749521975 1328.7138756879922, 884.2837099476344 1315.5041922421924, 844.2893186381363 1314.834368546934, 842.8213512918865 1402.4849860020697, 912.1085496022998 1398.3323013796062, 905.3736148660081 1326.7099056312004))
6 POLYGON ((905.3736148660081 1326.7099056312004, 884.0624749521975 1328.7138756879922, 884.2837099476344 1315.5041922421924, 844.2893186381363 1314.834368546934, 842.8213512918865 1402.4849860020697, 912.1085496022998 1398.3323013796062, 905.3736148660081 1326.7099056312004))
Building_Type Max_Occupancy Units Job_ID Hourly_Rate Start_Time
1 Commercial NA [379] 0 10.00000 07:46:00
2 Commercial NA [379] 1 22.21763 07:31:00
3 Commercial NA [380] 2 10.00000 08:00:00
4 Commercial NA [380] 3 15.31207 07:39:00
5 Commercial NA [381] 4 21.35541 07:53:00
6 Commercial NA [381] 5 12.09383 08:13:00
End_Time Days_To_Work
1 15:46:00 [Monday,Tuesday,Wednesday,Thursday,Friday]
2 15:31:00 [Monday,Tuesday,Wednesday,Thursday,Friday]
3 16:00:00 [Monday,Tuesday,Wednesday,Thursday,Friday]
4 15:39:00 [Monday,Tuesday,Wednesday,Thursday,Friday]
5 15:53:00 [Monday,Tuesday,Wednesday,Thursday,Friday]
6 16:13:00 [Monday,Sunday,Thursday,Tuesday,Saturday]
Education_Level
1 High School or College
2 Bachelors
3 Bachelors
4 Bachelors
5 High School or College
6 High School or College
# Cache the filtered check-ins as RDS (keeps the large raw CSV out of Git),
# read the cache back, and preview the first rows.
saveRDS(Workplace_Checkin, 'data/Workplace_Checkin.rds')
Workplace_Checkin <- readRDS('data/Workplace_Checkin.rds')
head(Workplace_Checkin)
# A tibble: 6 x 5
Participant_ID Timestamp Employer_ID Venue_Type Date
<dbl> <dttm> <dbl> <chr> <date>
1 120 2022-03-01 07:00:00 382 Workplace 2022-03-01
2 382 2022-03-01 07:00:00 846 Workplace 2022-03-01
3 517 2022-03-01 07:00:00 1339 Workplace 2022-03-01
4 783 2022-03-01 07:00:00 1745 Workplace 2022-03-01
5 789 2022-03-01 07:00:00 1763 Workplace 2022-03-01
6 949 2022-03-01 07:00:00 1790 Workplace 2022-03-01
# Cache the daily per-employer counts as RDS, read back, and preview.
saveRDS(Count_Checkin, 'data/Count_Checkin.rds')
Count_Checkin <- readRDS('data/Count_Checkin.rds')
head(Count_Checkin)
# A tibble: 6 x 3
# Groups: Date [1]
Date Employer_ID Num_of_Employees
<date> <dbl> <int>
1 2022-03-01 379 1
2 2022-03-01 381 4
3 2022-03-01 382 5
4 2022-03-01 383 6
5 2022-03-01 384 2
6 2022-03-01 385 5
Next, we put up a series of charts to address the question.
# Shift length per job: Start/End times are hms, so the difference is in seconds.
Employers <- Employers %>% mutate(Duration = End_Time - Start_Time)
# Coerce the difftime to plain numeric seconds for arithmetic below.
Employers <-Employers %>% mutate(across(c(Duration), as.numeric))
# NOTE(review): Duration/3600 * Hourly_Rate is the pay for ONE shift (a day),
# not a week — the name Weekly_Wages looks misleading unless multiplied by the
# number of working days in Days_To_Work. TODO confirm intent.
Employers <- Employers %>% mutate(Weekly_Wages = Duration/ 60 / 60 * Hourly_Rate)
# Convert IDs to character so ggplot treats them as discrete categories.
Employers$Employer_ID <- as.character(Employers$Employer_ID)
Employers$Building_ID <- as.character(Employers$Building_ID)
# Heatmap of hourly rate by employer and required education level.
# Fixed: the title ("Growth of Employment by States and NIC") and the y-axis
# label ("Age Group") were copy-paste leftovers — the y aesthetic is
# Education_Level, and the chart shows hourly rates, not employment growth.
ggplot(Employers, aes(Employer_ID, Education_Level, fill = Hourly_Rate)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "yellow", high = "purple") +
  labs(title = "Hourly Rate by Employer and Education Level",
       x = "Employer_ID", y = "Education Level", fill = "Hourly_Rate") +
  theme(axis.text.x = element_text(angle = 0, hjust = 1.0))
# Animated polar bar chart: daily employee count per employer, one frame per day.
# Employer IDs as character so they plot as discrete categories.
Count_Checkin$Employer_ID <- as.character(Count_Checkin$Employer_ID)
p<- ggplot(Count_Checkin, aes(x=Employer_ID, y=Num_of_Employees)) +
geom_bar(stat="identity") +
# NOTE(review): ylim() sets SCALE limits, so counts outside 1..15 are dropped
# (their bars vanish) rather than clipped — confirm this is intended; use
# coordinate limits if clipping was the goal.
ylim(1,15)+
coord_polar()+
transition_time(Date) +
labs(title = "Date: {frame_time}")
# 274-second animation (~one second per day of data); large render at 800x800.
animate(p, duration = 274,height = 800, width =800)
# Attach participant attributes (Age, Education_Level, ...) to each check-in.
Workplace_Checkin <- Workplace_Checkin %>% left_join(Participants,by="Participant_ID")
# Derive an age band from Age. NOTE(review): Age_Group was referenced below but
# never created anywhere, so the original group_by() errored; the bin edges here
# are an assumption — confirm the banding intended for the analysis.
Workplace_Checkin <- Workplace_Checkin %>%
  mutate(Age_Group = cut(Age, breaks = c(17, 25, 30, 35, 40, 45, 50, 55, 60)))
# Daily distinct-employee count per employer and age band.
# Removed a dead intermediate that grouped by Participant_ID (n_distinct of a
# grouping key is always 1) and was immediately overwritten by this pipeline.
Count_Work_AgeGroup <- Workplace_Checkin %>%
  group_by(Date, Employer_ID, Age_Group) %>%
  summarise('Num_of_Employees'= n_distinct(Participant_ID))
# Animated heatmap: daily distinct-employee count by employer and age group.
# Fixed: removed the dead `q <-` prefix (q was never used), merged the two
# labs() calls — the second title ("Date: {frame_time}") silently overrode the
# first, which was itself a copy-paste leftover from another chart.
Count_Work_AgeGroup$Employer_ID <- as.character(Count_Work_AgeGroup$Employer_ID)
ggplot(Count_Work_AgeGroup, aes(Employer_ID, Age_Group, fill = Num_of_Employees)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "green", high = "red") +
  labs(title = "Date: {frame_time}",
       x = "Employer_ID", y = "Age Group", fill = "No. of Employees") +
  theme(axis.text.x = element_text(angle = 0, hjust = 1.0)) +
  transition_time(Date)